Content
- What is R and who uses R?
- My R journey
- Tidyverse approach vs base-R
- Fun use cases implemented with R
- Web application in 5 minutes
How do developers play board games?
R is a system for statistical computation and graphics. It provides, among other things, a programming language, high level graphics, interfaces to other languages and debugging facilities.
“The best thing about R is that it was written by statisticians. The worst thing about R is that it was written by statisticians.”
~20% of base R is written in R.
https://h2oai.github.io/db-benchmark/groupby.html (Hardware: 20 x 2,6 GHz CPU, 126 GB DDR4 RAM)
nrow(available.packages())
Even more packages on GitHub - 64615
Python is king, but R kernels are more likely to win medals in Kaggle
Where I use R:
TOO MANY OPTIONS
summary(mydata$x)
summary(mydata$"x")
summary(mydata["x"])
summary(mydata[,"x"])
summary(mydata[["x"]])
summary(mydata[1])
summary(mydata[,1])
summary(mydata[[1]])
with(mydata, summary(x))
attach(mydata)
summary(x)
summary(subset(mydata, select=x))
TOO MANY OBJECTS (and messy code)
my_data_object <- read.csv("a_messy_csv_file.csv")
my_data_object$string <- as.character(my_data_object$string)
my_data_object$mean_of_smth <- mean(my_data_object$some_numeric_variable)
colnames(my_data_object) <- c("random name", "test","average", "b")
filtered_data <- my_data_object[!is.na(my_data_object$b) & my_data_object$b > 10,]
lets_filter_one_more_time <- filtered_data[!is.na(filtered_data$test),]
lets_randomly_create_new_data_object <- lets_filter_one_more_time[, 3]
Solution - the tidyverse approach!
The tidyverse is an opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures.
your_data %>%
select(Variable, Size, Random, test) %>%
filter(Size == "Large") %>%
mutate(test2 = substr(test, 1, 2)) %>%
group_by(test2) %>%
summarize(avg_variable = mean(Variable)) %>%
arrange(avg_variable)
tidyverse_packages()
## [1] "broom" "cli" "crayon" "dplyr" "dbplyr"
## [6] "forcats" "ggplot2" "haven" "hms" "httr"
## [11] "jsonlite" "lubridate" "magrittr" "modelr" "purrr"
## [16] "readr" "readxl\n(>=" "reprex" "rlang" "rstudioapi"
## [21] "rvest" "stringr" "tibble" "tidyr" "xml2"
## [26] "tidyverse"
Connect to the Data Lake:
con <- dbConnect(odbc(), "ODBC_driver_name")
Connect to RDBMS:
con <- dbConnect(odbc(),
Driver = "SQL Server",
Server = "production_server",
Database = "jjj",
UID = "yyy",
PWD = "xxx",
Port = 1433)
That’s it!
dbGetQuery(con, "SHOW DATABASES;")
r_table <- dbGetQuery(con, "SELECT * FROM SOME_TABLE;")
# or
dbplyr_table <- tbl(con, "SOME_TABLE")
Connecting to Spark via Jupyter notebook:
Other options:
sparklyr - another R interface for Spark
Data profiling using SparkR interface:
rkafka library for Kafka interface in R:
library(rkafka)
Consume Kafka messages (create Java object):
consumer <- rkafka.createConsumer(zookeeperConnect = "xxxxx:8888",
topicName = "gasprices_full",
groupId = "group_id",
autoCommitEnable = "false",
autoOffsetReset = "smallest")
Read messages:
rkafka.readPoll(consumer)
rkafka.read(consumer)
Parse received data (jsonlite):
# Parse each received message in `prices` with jsonlite and collect its Date
# and Price fields into one data frame `df`, one row per message.
# NOTE(review): assumes `prices` is a character vector of JSON strings -
# confirm against the Kafka read step.
#
# lapply() handles an empty `prices` correctly (1:length(prices) would have
# iterated over c(1, 0)), and all rows are bound in a single bind_rows() call
# instead of re-copying `df` on every iteration.
parsed_rows <- lapply(prices, function(raw_message) {
  parsed <- fromJSON(raw_message, flatten = TRUE)
  c(Date = parsed$Date, Price = parsed$Price)
})
# Start from the same empty data frame as before so that `df` still exists
# (with zero rows) when no messages were received; unname() keeps any names
# on `prices` from being interpreted as argument names by bind_rows().
df <- do.call(
  bind_rows,
  c(list(data.frame(stringsAsFactors = FALSE)), unname(parsed_rows))
)
Tidy data (tidyr):
gas_prices <- df %>%
as_tibble() %>%
mutate(Date = as.Date(Date), Price = as.numeric(Price)) %>%
complete(Date = seq.Date(min(Date), max(Date), by="day")) %>%
fill(Price) %>%
filter(Date > "2016-01-01")
“Data science” in 5 minutes:
ts <- tk_ts(gas_prices, start = 2016, frequency = 365, silent = TRUE)
fit_ts <- ts %>%
HoltWinters()
fcast_ts <- fit_ts %>%
forecast(h = 180)
initiatives <- tbl(con, in_schema("jiraschema", "jiraissue")) %>%
filter(PROJECT %in% c("aaaa", "bbb", "ccccc") & issuetype == "xxx") %>%
left_join(select(tbl(con, in_schema("jiraschema", "issuetype")), pname, ID), by = c("issuetype" = "ID")) %>%
rename(issue_type = pname) %>%
left_join(select(tbl(con, in_schema("jiraschema", "issuestatus")), pname, ID), by = c("issuestatus" = "ID")) %>%
rename(issue_status = pname) %>%
left_join(select(tbl(con, in_schema("jiraschema", "PROJECT")), pname, ID, pkey), by = c("PROJECT" = "ID")) %>%
rename(project_name = pname, project_key = pkey.y) %>%
collect()
Dealing with Portfolio management plugin data:
JSON structure:
Get all solution related entries:
SOLUTION <- tbl(con, in_schema("jiraschema", "AO_D9132D_SOLUTION")) %>%
filter(str_detect(SOLUTION, '"xxxxx"')) %>%
collect()
Create a dataframe for issue links/dependencies:
links <- lastsolution$solution$hierarchy %>%
melt() %>%
mutate(type = "hierarchy") %>%
union_all(lastsolution$solution$issueDependents %>%
melt() %>%
mutate(type = "issueDependents")) %>%
rename(from = value, to = L1)
Network visualization using JavaScript libraries:
library(visNetwork)
visnet <- visNetwork(vis.nodes, vis.links) %>%
...
Customer review classification in the Lithuanian language
REST API response example:
Creating TF model in R:
model <- keras_model_sequential() %>%
layer_embedding(input_dim = c(max_words), output_dim = 128,
input_length = c(maxlen)) %>%
layer_flatten() %>%
layer_dense(units = 3, activation = "softmax") %>%
compile(
optimizer = "adam",
loss = "categorical_crossentropy",
metrics = c("acc")
)
Deploying the model:
library(tfdeploy)
serve_savedmodel("final_model", host = "127.0.0.1", port = 8089)
Creating REST API for data entry:
#* @get /version
model_version <- function() {
  # Respond with a data frame carrying a fixed 200 status and the value of
  # the global `VERSION` (defined outside this snippet).
  data.frame(status = 200, api_version = VERSION)
}
Running REST API:
library(plumber)
r <- plumb("API.R")
r$run(port=8000)
Problem no. 1 - track testing activities
Problem no. 2 - monitoring activities
Problem no. 3 - anomaly detection in the server logs
Started by user [8maha:////4LPCl/hG3BqvHgSrKRfoOIjnkAUEcTIfAH4/u4RYPGaasasassa9b85aBtbiIQTGjNKU4P08vOT+vOD8nVc83PyU1x6OyILUoJzMv2y+/JJUBAhiZGBgqihhk0NSjKDWzXb3RdlLBUSYGJk8GtpzUvPSSDB8G5tKinBIGIZ+sxLJE/ZzEvHT94JKizLx0a6BxUmjGOUNodHsLgAz+EgYe/dLi1CL9YnMDS8MkAEL/AdzBAAAA[0mTestuotojas Testuotojas
Building in workspace /opt/app/jenkins/workspace/XXX tests/XXXXX
> git rev-parse --is-inside-work-tree # timeout=10
Fetching changes from the remote Git repository
[2018-11-01 02:08:37,633] INFO [GroupMetadataManager brokerId=1003] Removed 0 expired offsets in 1 milliseconds. (kafka.coordinator.group.GroupMetadataManager)
[2018-11-01 02:10:20,126] INFO [GroupCoordinator 1003]: Member consumer-105-53acxxxd95-7a99-4f87-a382-1df851415401 in group xxxxxxxxx has failed, removing it from the group (kafka.coordinator.group.GroupCoordinator)
THE START OF A DATA JOURNEY
tail -f game.json | nc -lk 445…
[{ "gameId": 3, "name": "gameStart", "time": 1541957192}]
[{ "gameId": 3, "combinationId": 0, "name": "whiteStartedMoving", "x": 0.028963912, "y": 0.8928387, "time": 1541957205}]
[{ "gameId": 3, "combinationId": 0, "name": "combinationStart", "time": 1541957220}]
[{ "gameId": 3, "combinationId": 0, "name": "whiteStartPosition", "x": 0.44422802, "y": 0.56917685, "time": 1541957220}]…
ggplot2 to the rescue!
# Draw the pool table: the table picture as background, the white ball, the
# points in `df`, and one labelled path per ball in `balls`.
ggplot() +
  # Background image. Its bounds extend past the [0, 1] axis limits set
  # below - presumably so the picture's border falls outside the plotted
  # area; TODO confirm.
  annotation_custom(
    rasterGrob(pool_img, width = unit(1, "npc"), height = unit(1, "npc")),
    xmin = -0.09, xmax = 1.09, ymin = -0.29, ymax = 1.29
  ) +
  geom_point(
    data = white_ball, aes(x = x, y = y),
    size = 15, color = "white"
  ) +
  # `linetype` is not a point aesthetic (ggplot2 ignored it with a warning),
  # so it is no longer passed here.
  geom_point(data = df, aes(x = x, y = y), size = 2, color = "blue") +
  geom_path(
    data = balls, aes(x = x, y = y, color = as.factor(ball_id)),
    size = 15, show.legend = FALSE
  ) +
  scale_color_manual(values = c("lightblue", "blue")) +
  geom_text(data = balls, aes(x = x, y = y, label = ball_id), size = 5) +
  ylim(0, 1) +
  xlim(0, 1) +
  theme_transparent()
Reactive polling in order to observe the database without refreshing the app:
# Reactive data source that re-checks the database every 500 ms.
# checkFunc returns the highest `id` in the table; valueFunc reloads the
# table and keeps only the rows of the game with the most recent `time`.
data <- reactivePoll(
  500, session,
  checkFunc = function() {
    newest_id_sql <- paste0(
      'SELECT id FROM ',
      table_name,
      ' ORDER BY id DESC LIMIT 1'
    )
    dbGetQuery(conn, newest_id_sql)
  },
  valueFunc = function() {
    # Read the table first, then look up the latest game id (same order of
    # database calls as before).
    all_events <- dbReadTable(conn, table_name)
    newest_game_sql <- paste0(
      'SELECT game_id from ',
      table_name,
      ' ORDER BY time DESC LIMIT 1'
    )
    newest_game_id <- dbGetQuery(conn, newest_game_sql)[1, 1]
    current_game <- filter(all_events, game_id == newest_game_id)
    # x is passed through unchanged; y is flipped to 1 - y.
    mutate(current_game, x = x, y = 1 - y)
  }
)
Shiny is a new package from RStudio that makes it incredibly easy to build interactive web applications with R.
ui.R
fluidPage(
sidebarLayout(
sidebarPanel(
sliderInput("obs", "Number of observations:", min = 10, max = 500, value = 100)
),
mainPanel(plotOutput("distPlot"))
)
)
server.R
function(input, output) {
output$distPlot <- renderPlot({
hist(rnorm(input$obs), col = 'darkgray', border = 'white')
})
}
sudo cp -R /home/your_user/your_directory/* /srv/shiny-server/your_app
install.packages("tidyverse")
install.packages("rmarkdown")
install.packages("shiny")
?tidyverse
?shiny
?rmarkdown